suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(splines))
rain <- fread("airport_data/IDCJAC0009_009021_1800_Data.csv")
rain <- rain %>% drop_na("Rainfall amount (millimetres)")
min_year = rain$Year %>% min
max_year = rain$Year %>% max
rainfall_totals <- rain %>% 
    group_by(Year, Month) %>% 
    summarise(total_rainfall = sum(`Rainfall amount (millimetres)`))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
rainfall_totals %>% 
    filter(Year != 1944, Year != 2022) %>% 
    ggplot(aes(Year, total_rainfall)) + 
    facet_grid(cols=vars(Month)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method="lm", colour = "red") +
    theme_classic() +
    theme(axis.text.x=element_text(angle=60,hjust=1)) +
    scale_x_continuous(breaks = c(min_year+1, max_year-1)) +
    ggtitle("Perth rainfall monthly totals for each month between 1945 and 2021")
## `geom_smooth()` using formula = 'y ~ x'

#ggsave("monthly_totals_regressed_year_as_predictor.png")
rainfall_totals %>% 
    filter(Year != 1944, Year != 2022) %>% 
    filter(Month > 4, Month <= 9) %>% 
    ggplot(aes(Year, total_rainfall)) + 
    facet_grid(cols=vars(Month)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method="loess", colour = "red") +
    theme_classic() +
    theme(axis.text.x=element_text(angle=60,hjust=1)) +
    scale_x_continuous(breaks = c(min_year+1, max_year-1)) +
    ggtitle("Perth rainfall monthly totals for each month between 1945 and 2021")
## `geom_smooth()` using formula = 'y ~ x'

rainfall_totals %>% 
    filter(Year != 1944, Year != 2022) %>% 
    filter(Month < 4 | Month > 10) %>% 
    ggplot(aes(Year, total_rainfall)) + 
    facet_grid(cols=vars(Month)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method="loess", colour = "red") +
    theme_classic() +
    theme(axis.text.x=element_text(angle=60,hjust=1)) +
    scale_x_continuous(breaks = c(min_year+1, max_year-1)) +
    ggtitle("Perth rainfall monthly totals for each month between 1945 and 2021")
## `geom_smooth()` using formula = 'y ~ x'

June vs July - years when july is wetter?

rainfall_totals_june_july_higher <- rainfall_totals %>% 
    filter(Month > 5, Month < 8) %>% 
    #cols become Year june july
    pivot_wider(names_from = c(Month), values_from = total_rainfall) %>% 
    mutate(higher_rain = case_when(
        `6` > `7` ~ "June",
        `7` > `6` ~ "July",
    )) %>% 
    mutate(higher_rain_ordinal = case_when(
        higher_rain == "June" ~ 0,
        higher_rain == "July" ~ 1
    )) %>% 
    mutate(delta = `7` - `6`)
rainfall_totals_june_july_higher %>% 
    ggplot(aes(Year, higher_rain_ordinal)) +
    geom_point()
## Warning: Removed 1 rows containing missing values (`geom_point()`).

rainfall_totals_june_july_higher %>% 
    ggplot(aes(Year, delta)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.8, colour = "red") +
    geom_smooth(method="glm") +
    ggtitle("Difference in total rainfall in July compared to June (Perth Airport, BOM data)") +
    ylab("Difference (mm)") +
    theme_bw() +
    ylim(-400,400) +
    annotate("text", label = "July", x = 1980, y = 300, size = 8) +
    annotate("text", label = "June", x = 1980, y = -300, size = 8)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Removed 1 rows containing missing values (`geom_point()`).

#ggsave("Difference in total rainfall in July compared to June.png")
m.rainfall_totals_june_july_higher <- glm(delta ~ Year, data = rainfall_totals_june_july_higher)
summary(m.rainfall_totals_june_july_higher)
## 
## Call:
## glm(formula = delta ~ Year, data = rainfall_totals_june_july_higher)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -272.34   -46.08     1.90    44.35   403.26  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -1859.4794   923.1161  -2.014   0.0475 *
## Year            0.9385     0.4656   2.016   0.0474 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8571.591)
## 
##     Null deviance: 686268  on 77  degrees of freedom
## Residual deviance: 651441  on 76  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 931.71
## 
## Number of Fisher Scoring iterations: 2
rainfall_totals_july_august_higher <- rainfall_totals %>% 
    filter(Month > 6, Month < 9) %>% 
    #cols become Year june july
    pivot_wider(names_from = c(Month), values_from = total_rainfall) %>% 
    mutate(higher_rain = case_when(
        `7` > `8` ~ "July",
        `8` > `7` ~ "August"
    )) %>% 
    mutate(higher_rain_ordinal = case_when(
        higher_rain == "July" ~ 0,
        higher_rain == "August" ~ 1
    )) %>% 
    mutate(delta = `8` - `7`)

rainfall_totals_july_august_higher %>% 
    ggplot(aes(Year, delta)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.8, colour = "red") +
    geom_smooth(method="glm") +
    ggtitle("Difference in total rainfall in August compared to July (Perth Airport, BOM data)") +
    ylab("Difference (mm)") +
    theme_bw() +
    ylim(-400,400) +
    annotate("text", label = "August", x = 1980, y = 300, size = 8) +
    annotate("text", label = "July", x = 1980, y = -300, size = 8)
## `geom_smooth()` using formula = 'y ~ x'

#ggsave("Difference in total rainfall in August compared to July")
m.rainfall_totals_july_august_higher <- glm(delta ~ Year, data = rainfall_totals_july_august_higher)
summary(m.rainfall_totals_july_august_higher)
## 
## Call:
## glm(formula = delta ~ Year, data = rainfall_totals_july_august_higher)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -351.30   -46.22    -3.07    52.36   225.07  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -750.8180   862.5827  -0.870    0.387
## Year           0.3597     0.4351   0.827    0.411
## 
## (Dispersion parameter for gaussian family taken to be 7484.283)
## 
##     Null deviance: 573920  on 77  degrees of freedom
## Residual deviance: 568806  on 76  degrees of freedom
## AIC: 921.13
## 
## Number of Fisher Scoring iterations: 2

To get stats for these 12 regressions means

Do some basic sensitivity tests:

test_july = rainfall_totals %>% filter(Month == 7)
july_rainfall_vs_year <- lm(total_rainfall ~ Year, data = test_july)
july_rainfall_vs_year %>% summary()
## 
## Call:
## lm(formula = total_rainfall ~ Year, data = test_july)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.487  -34.285   -1.097   32.599  278.202 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 1198.6345   577.9313   2.074   0.0415 *
## Year          -0.5261     0.2915  -1.805   0.0751 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.96 on 76 degrees of freedom
## Multiple R-squared:  0.0411, Adjusted R-squared:  0.02848 
## F-statistic: 3.258 on 1 and 76 DF,  p-value: 0.07506
plot(july_rainfall_vs_year)

test_dec = rainfall_totals %>% filter(Month == 12)
dec_rainfall_vs_year <- lm(total_rainfall ~ Year, data = test_dec)
dec_rainfall_vs_year %>% summary()
## 
## Call:
## lm(formula = total_rainfall ~ Year, data = test_dec)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.522  -7.378  -3.270   2.290  59.489 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 222.34162  131.43143   1.692   0.0948 .
## Year         -0.10663    0.06629  -1.608   0.1119  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.18 on 76 degrees of freedom
## Multiple R-squared:  0.03292,    Adjusted R-squared:  0.0202 
## F-statistic: 2.587 on 1 and 76 DF,  p-value: 0.1119
plot(dec_rainfall_vs_year)

predict(dec_rainfall_vs_year, list(Year = 2021), interval = "confidence")
##        fit       lwr      upr
## 1 6.844791 0.9561971 12.73338
summary(dec_rainfall_vs_year)$coefficients[2,]
##    Estimate  Std. Error     t value    Pr(>|t|) 
## -0.10662881  0.06629153 -1.60848322  0.11187595

test all months individually

I can make this set the intercept to the avg in 1945 by substracting 1945 from each year before doing the regression.

models.year_vs_rainfall_per_month <- rainfall_totals %>% 
    mutate(Year = Year - 1945) %>% 
    group_by(Month) %>% 
    group_modify(~ broom::tidy(lm(total_rainfall ~ Year, data = .)))
models.year_vs_rainfall_per_month
## # A tibble: 24 × 6
## # Groups:   Month [12]
##    Month term        estimate std.error statistic  p.value
##    <int> <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1     1 (Intercept)   2.37      4.37       0.542 5.90e- 1
##  2     1 Year          0.216     0.0979     2.20  3.07e- 2
##  3     2 (Intercept)  12.6       5.94       2.12  3.72e- 2
##  4     2 Year          0.0616    0.133      0.463 6.45e- 1
##  5     3 (Intercept)  13.3       3.83       3.48  8.41e- 4
##  6     3 Year          0.0758    0.0859     0.882 3.80e- 1
##  7     4 (Intercept)  48.8       6.56       7.44  1.32e-10
##  8     4 Year         -0.232     0.147     -1.57  1.20e- 1
##  9     5 (Intercept) 121.       10.8       11.2   8.15e-18
## 10     5 Year         -0.618     0.244     -2.53  1.34e- 2
## # … with 14 more rows

plot of beta estimates per month!

models.year_vs_rainfall_per_month %>% 
    filter(term == "Year") %>% 
    ggplot(aes(Month, estimate, ymin=estimate-std.error, ymax=estimate+std.error, colour = -log10(p.value))) + 
    theme_classic() +
    geom_point() +
    geom_pointrange() +
    ggtitle("Estimated change in average total monthly rainfall per year") +
    ylab("estimated change rainfall (mm)") +
    scale_x_continuous(breaks = 1:12)

#ggsave("Estimated_change_in_average_total_monthly_rainfall_per_year.png")

Plot of 1945 estimate of rainfall and 2021 estimate of rainfall totals?

  • 1945 is just intercept value.
  • 2021 will be 1945_total + (2021-1945)*beta_month
models.year_vs_rainfall_per_month %>% 
    filter(term == "(Intercept)") %>% 
    ggplot(aes(Month, estimate, ymin=estimate-std.error, ymax=estimate+std.error, colour = -log10(p.value))) + 
    geom_point() +
    geom_pointrange() +
    ggtitle("Estimate for mean rainfall at Perth Airport in 1945 (intercept estimates)") +
    theme_classic()

2021 estimates? - 2021 will be 1945_total + (2021-1945)*beta_month

models.year_vs_rainfall_per_month %>% 
    pivot_wider(names_from = c(term), values_from = c("estimate","std.error","statistic","p.value")) %>% 
    mutate(estimate_2021 = `estimate_(Intercept)` + 76*`estimate_Year`) %>% 
    ggplot(aes(Month, estimate_2021, colour = -log10(p.value_Year))) + 
    geom_point() +
    ggtitle("Estimate for mean rainfall at Perth Airport in 2021") +
    theme_classic()

Maybe easier to use predict() on model object to get 2021 estimates and std.err. I have re-done the regression without subtracting year as it is simpler to put in predict(year = 2021) than as predict(year = 2021-1945).

Years <- c(1945,1970,2021)
models.year_vs_rainfall_per_month.predictions.CI <- rainfall_totals %>% 
    group_by(Month) %>% 
    group_modify(
        ~ as.data.frame(predict(
                lm(total_rainfall ~ Year, data = .), list(Year = Years), interval = "confidence"
            ))) %>% 
    mutate(Year = Years)

models.year_vs_rainfall_per_month.predictions.CI %>% 
    ggplot(aes(Month, fit, colour = Year, group = Year)) + 
    geom_point() + geom_line() +
    theme_classic() +
    ggtitle("Monthly average total rainfall estimate predicted from per month model\nfor 1945, 1970, and 2021 (Perth Airport)") +
    ylab("Estimated rainfall total (mm)") +
    scale_x_continuous(breaks = 1:12)

#ggsave("monthly_total_rain_est_predicted_from_per_month_models.png")

Test month as predictor of rainfall total

Oh cool year is significant here! All/most months are significant too. Note January is baseline here. Hence all estimates are positive. Significance of other months depends on which month is used as the default month here. Since summer months are similar to each other it is unsurprising that they have small relative differences to each other.

#need Month as factor for this to work
rainfall_totals_cat_month <- rainfall_totals %>% 
    mutate(Month = as.factor(Month)) %>% 
    mutate(Month = fct_relevel(Month, as.character(1:12)))
month_predict_rainfall <- lm(total_rainfall ~ Month + Year, data = rainfall_totals_cat_month)
summary(month_predict_rainfall)
## 
## Call:
## lm(formula = total_rainfall ~ Month + Year, data = rainfall_totals_cat_month)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -141.718  -17.390   -4.492   16.877  284.929 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 509.67014  112.51788   4.530 6.68e-06 ***
## Month2        4.30256    6.27236   0.686   0.4929    
## Month3        5.57436    6.27236   0.889   0.3744    
## Month4       29.21923    6.27236   4.658 3.65e-06 ***
## Month5       86.68607    6.25255  13.864  < 2e-16 ***
## Month6      141.74050    6.25255  22.669  < 2e-16 ***
## Month7      144.69329    6.27262  23.067  < 2e-16 ***
## Month8      106.90996    6.27262  17.044  < 2e-16 ***
## Month9       61.29329    6.27262   9.772  < 2e-16 ***
## Month10      32.39201    6.27262   5.164 2.96e-07 ***
## Month11      15.08047    6.27262   2.404   0.0164 *  
## Month12       0.03560    6.27262   0.006   0.9955    
## Year         -0.25158    0.05668  -4.438 1.02e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.17 on 925 degrees of freedom
## Multiple R-squared:  0.6488, Adjusted R-squared:  0.6442 
## F-statistic: 142.4 on 12 and 925 DF,  p-value: < 2.2e-16

LOL this looks awful! 🤣 I guess Month as a categorical variable is problematic here.

plot(month_predict_rainfall)

Try month as a numeric value? Well this returns a similar value for Year at least. Diagnostic plots still show assumptions very violated.

month_predict_rainfall_monthnumeric <- lm(total_rainfall ~ Month + Year, data = rainfall_totals)
summary(month_predict_rainfall_monthnumeric)
## 
## Call:
## lm(formula = total_rainfall ~ Month + Year, data = rainfall_totals)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -81.97 -49.80 -23.40  36.58 375.96 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 575.9180   187.1259   3.078  0.00215 **
## Month         1.7696     0.6171   2.868  0.00423 **
## Year         -0.2643     0.0943  -2.803  0.00517 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.17 on 935 degrees of freedom
## Multiple R-squared:  0.01721,    Adjusted R-squared:  0.01511 
## F-statistic: 8.188 on 2 and 935 DF,  p-value: 0.0002983
plot(month_predict_rainfall_monthnumeric)

rainy days per day?

rain <- rain %>% 
    mutate(did_rain = `Rainfall amount (millimetres)` > 0)

rainy_days <- rain %>% 
    group_by(Month, Day) %>% 
    summarise(number_of_rainy_days = sum(did_rain), days_observed = n()) %>% 
    mutate(proportion_rained = number_of_rainy_days/days_observed)
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
rainy_days
## # A tibble: 366 × 5
## # Groups:   Month [12]
##    Month   Day number_of_rainy_days days_observed proportion_rained
##    <int> <int>                <int>         <int>             <dbl>
##  1     1     1                    5            78            0.0641
##  2     1     2                    4            78            0.0513
##  3     1     3                    6            78            0.0769
##  4     1     4                    3            78            0.0385
##  5     1     5                    8            78            0.103 
##  6     1     6                    4            78            0.0513
##  7     1     7                    6            78            0.0769
##  8     1     8                    8            78            0.103 
##  9     1     9                    5            78            0.0641
## 10     1    10                    7            78            0.0897
## # … with 356 more rows
rainy_days %>% 
    ggplot(aes(x=Day, y=proportion_rained)) +
    geom_point() + facet_grid(cols=vars(Month), space="free") +
    theme_classic() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

How can I get this to go across months continously?

rainy_days %>% 
    ggplot(aes(x=Month, y=proportion_rained, group = factor(Month))) +
    geom_boxplot()

rain %>% 
    filter(Month == 6) %>% 
    ggplot(aes(Day, `Rainfall amount (millimetres)`)) +
    geom_point()

rainy_days %>% 
    filter(Month == 6) %>% 
    ggplot(aes(Day, proportion_rained)) +
    geom_point()

rainy_days %>% 
    filter(Month == 7) %>% 
    ggplot(aes(Day, proportion_rained)) +
    geom_point()

rainy_days %>% 
    filter(Month == 8) %>% 
    ggplot(aes(Day, proportion_rained)) +
    geom_point()

Combine month and day as a single axis? Yes, can do this as a simple index.

Can then add days and month as x axis?

rainy_days
## # A tibble: 366 × 5
## # Groups:   Month [12]
##    Month   Day number_of_rainy_days days_observed proportion_rained
##    <int> <int>                <int>         <int>             <dbl>
##  1     1     1                    5            78            0.0641
##  2     1     2                    4            78            0.0513
##  3     1     3                    6            78            0.0769
##  4     1     4                    3            78            0.0385
##  5     1     5                    8            78            0.103 
##  6     1     6                    4            78            0.0513
##  7     1     7                    6            78            0.0769
##  8     1     8                    8            78            0.103 
##  9     1     9                    5            78            0.0641
## 10     1    10                    7            78            0.0897
## # … with 356 more rows
rainy_days$index <- 1:length(rainy_days$Day)
rainy_days <- rainy_days %>% mutate(date_str = str_c(Day, Month, sep = "/"))
#custom_breaks = seq(1,366,31)
custom_breaks = c(
    1,32,32+29,32+29+31,92+30,153,153+30,214,245,275,306,336,366
)
rainy_days %>% 
    ggplot(aes(index, proportion_rained*100)) +
    geom_point() +
    geom_smooth(span=0.5) +
    scale_y_continuous(breaks = seq(0,1,0.1)*100) +
    ylab("Percentage of days with any rain") +
    xlab("Date") +
    scale_x_continuous(breaks = custom_breaks, labels = rainy_days$date_str[custom_breaks]) +
    ggtitle("Should you plan to go outside? Percent chance of any rain for Perth Airport") +
    theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#ggsave("Percent chance of any rain for Perth Airport.png", width = 10, height = 7)
m.loess <- loess(proportion_rained ~ index, data = rainy_days, span=0.5)


X <- rainy_days$index
Y <- predict(m.loess, X)
plot(X, rainy_days$proportion_rained, xlab = "day", ylab = "proportion (0 to 1)", main="proportion of rainy days (Perth Airport)")
points(X,Y, type = "l")

dY <- diff(Y)/diff(X)
dX <- rowMeans(embed(X,2))

plot(dX, abs(dY), type="l", xlab = "day", ylab = "absolute rate of change", main = "rate of change in proportion of rainy days (Perth Airport)")